Retrieve the list of biggest world population

In [1]:
import pandas as pd
import sys
sys.path.append('..')
from data.unlabeled import WORLD_CITIES as wct
In [2]:
wct.head()
Out[2]:
city city_ascii lat lng country iso2 iso3 admin_name capital population id
0 Tokyo Tokyo 35.6897 139.6922 Japan JP JPN Tōkyō primary 37977000.0 1392685764
1 Jakarta Jakarta -6.2146 106.8451 Indonesia ID IDN Jakarta primary 34540000.0 1360771077
2 Delhi Delhi 28.6600 77.2300 India IN IND Delhi admin 29617000.0 1356872604
3 Mumbai Mumbai 18.9667 72.8333 India IN IND Mahārāshtra admin 23355000.0 1356226629
4 Manila Manila 14.5958 120.9772 Philippines PH PHL Manila primary 23088000.0 1608618140

Dataset normalization

In [3]:
wct.isnull().sum()
Out[3]:
city              0
city_ascii        0
lat               0
lng               0
country           0
iso2             31
iso3              0
admin_name       76
capital       18943
population      973
id                0
dtype: int64
In [4]:
len(wct[wct.columns].drop_duplicates()), len(wct[wct.columns]) # no duplicates
Out[4]:
(26569, 26569)
In [5]:
wct = wct.drop(columns=["city", "iso2", "iso3", "admin_name", "capital", "id"])
In [6]:
wct.columns
Out[6]:
Index(['city_ascii', 'lat', 'lng', 'country', 'population'], dtype='object')
In [7]:
wct = wct.rename(columns={'city_ascii':'city'})
In [8]:
wct.columns
Out[8]:
Index(['city', 'lat', 'lng', 'country', 'population'], dtype='object')

Missing values

In [9]:
to_drop = wct[wct.population.isnull()]
to_drop
Out[9]:
city lat lng country population
824 Al Quds 31.7764 35.2269 West Bank NaN
827 Ngerulmud 7.5006 134.6242 Palau NaN
6255 Un'goofaaru 5.6681 73.0302 Maldives NaN
6393 Banqiao 25.0143 121.4672 Taiwan NaN
7568 Naifaru 5.4442 73.3662 Maldives NaN
... ... ... ... ... ...
9469 We -20.9000 167.2667 New Caledonia NaN
9470 Presevo 42.3067 21.6500 Serbia NaN
9471 Bujanovac 42.4667 21.7667 Serbia NaN
9472 Kitamilo 0.2222 33.2061 Uganda NaN
9473 Tarrafal 15.2833 -23.7667 Cabo Verde NaN

973 rows × 5 columns

In [10]:
# dropping missing values from the dataset
wctc = wct.copy(deep=False) 
wctc.dropna(inplace=True)
In [11]:
wctc.isnull().sum() # cleaned dataset
Out[11]:
city          0
lat           0
lng           0
country       0
population    0
dtype: int64

Look at the position of the cities with missing data in a map to see if some country is not represented

We need to verify if the null values that have been dropped are randomly distributed or there's a hidden pattern. In this way we see if all geographic areas are represented.

In [12]:
import geojson
import folium
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-12-6a09eb20f703> in <module>
----> 1 import geojson
      2 import folium

ModuleNotFoundError: No module named 'geojson'
In [14]:
data_to_plot = list(wctc.columns[3:5])
In [15]:
def df_to_geojson(df, properties, lat='latitude', lon='longitude'):
    """
    Turn a dataframe containing point data into a geojson formatted python dictionary
    
    df : the dataframe to convert to geojson
    properties : a list of columns in the dataframe to turn into geojson feature properties
    lat : the name of the column in the dataframe that contains latitude data
    lon : the name of the column in the dataframe that contains longitude data
    """
    
    # create a new python dict to contain our geojson data, using geojson format
    geojson = {'type':'FeatureCollection', 'features':[]}

    # loop through each row in the dataframe and convert each row to geojson format
    for _, row in df.iterrows():
        # create a feature template to fill in
        feature = {'type':'Feature',
                   'properties':{},
                   'geometry':{'type':'Point',
                               'coordinates':[]}}

        # fill in the coordinates
        feature['geometry']['coordinates'] = [row[lon],row[lat]]

        # for each column, get the value and add it as a new feature property
        for prop in properties:
            feature['properties'][prop] = row[prop]
        
        # add this feature (aka, converted dataframe row) to the list of features inside our dict
        geojson['features'].append(feature)
    
    return geojson
In [16]:
geo = df_to_geojson(to_drop, data_to_plot, lat = "lat", lon = "lng")
In [17]:
m = folium.Map([9,9], zoom_start=2)

folium.GeoJson(geo).add_to(m)

# uncomment below to see the map
# m
Out[17]:
<folium.features.GeoJson at 0x7f905d316a30>
In [18]:
to_drop[to_drop["country"]== "Malta"] # all null values
Out[18]:
city lat lng country population
7901 Sliema 35.9125 14.5019 Malta NaN
8144 Fgura 35.8703 14.5133 Malta NaN
8151 Hamrun 35.8847 14.4844 Malta NaN
8238 Senglea 35.8875 14.5169 Malta NaN
8264 Tarxien 35.8658 14.5150 Malta NaN
... ... ... ... ... ...
8883 Santa Lucija 36.0431 14.2172 Malta NaN
8935 Zebbug 36.0722 14.2358 Malta NaN
8946 Imgarr 35.9206 14.3664 Malta NaN
8971 Gharb 36.0600 14.2089 Malta NaN
9035 San Lawrenz 36.0556 14.2036 Malta NaN

65 rows × 5 columns

It looks like some countries have more missing information than others.

Retrieve the world biggest cities

A big city is categorized as >500,000 residents in the given year, 2020.

In [19]:
big_cities = wctc.sort_values(by=["population"], ascending=False).head(1000)
In [20]:
big_cities.head()
Out[20]:
city lat lng country population
0 Tokyo 35.6897 139.6922 Japan 37977000.0
1 Jakarta -6.2146 106.8451 Indonesia 34540000.0
2 Delhi 28.6600 77.2300 India 29617000.0
3 Mumbai 18.9667 72.8333 India 23355000.0
4 Manila 14.5958 120.9772 Philippines 23088000.0
In [21]:
big_cities.to_csv('bigcities.csv',index=False)
In [22]:
geo2 = df_to_geojson(big_cities, data_to_plot, lat = "lat", lon = "lng")
m2 = folium.Map([9,9], zoom_start=2)
folium.GeoJson(geo2).add_to(m2)

# uncomment below to see the map
# m2
Out[22]:
<folium.features.GeoJson at 0x7f905e015a30>

We can see that some countries are not represented with this approach. For example, the African State of Namibia.

Retrieve the most populated cities (6 max) for every country

In [23]:
countries = list(wctc["country"].unique())
top_cities = pd.DataFrame(columns = wctc.columns)
In [24]:
top_cities = top_cities.append([wct[wct["country"] == country].sort_values(by=["population"], ascending=False).head(6) for country in countries], ignore_index = True)
In [25]:
top_cities
Out[25]:
city lat lng country population
0 Tokyo 35.6897 139.6922 Japan 37977000.0
1 Osaka 34.6936 135.5019 Japan 14977000.0
2 Nagoya 35.1167 136.9333 Japan 9113000.0
3 Yokohama 35.4333 139.6333 Japan 3748781.0
4 Fukuoka 33.5903 130.4019 Japan 2128000.0
... ... ... ... ... ...
1131 Grand Turk 21.4664 -71.1360 Turks And Caicos Islands 5801.0
1132 Avarua -21.2070 -159.7710 Cook Islands 5445.0
1133 Vatican City 41.9000 12.4478 Vatican City 825.0
1134 Stanley -51.7000 -57.8500 Falkland Islands (Islas Malvinas) 2213.0
1135 Grytviken -54.2806 -36.5080 South Georgia And South Sandwich Islands 99.0

1136 rows × 5 columns

In [26]:
top_cities.to_csv('bigcities_allcountries.csv',index=False)
In [27]:
geo3 = df_to_geojson(top_cities, data_to_plot, lat = "lat", lon = "lng")
m3 = folium.Map([9, 9], zoom_start=2)
folium.GeoJson(geo3).add_to(m3)

# uncomment below to see the map
m3
Out[27]:
Make this Notebook Trusted to load map: File -> Trust Notebook

In this way every country should be represented. Probably also those containing a lot of cities that have been dropped.